#!/usr/bin/env python
import os
import sys
from collections import Counter
from numpy import *


# Nested mapping filled below:
#   counts[dataset][library][annotation] -> integer array parsed from one row
# The header check on each input file requires the column order
# A, C, G, T, a, c, g, t.
counts = {}

datasets = ("MiSeq", "HiSeq", "CAGE")
for dataset in datasets:
    # Library names are taken from the files in the mapping directory; every
    # entry there must be named "<library>.bam" (exactly one dot), otherwise
    # the unpacking or the assertion below fails.
    libraries = []
    directory = "/osc-fs_home/mdehoon/Data/CASPARs/%s/Mapping/" % dataset
    filenames = os.listdir(directory)
    for filename in filenames:
        library, extension = filename.split(".")
        assert extension == "bam"
        libraries.append(library)
    libraries.sort()
    counts[dataset] = {}
    for library in libraries:
        # The entry is created before the skip check, so a skipped library
        # stays in counts[dataset] as an empty dictionary.
        counts[dataset][library] = {}
        filename = "firstnucleotide.%s.%s.txt" % (dataset, library)
        if dataset == "HiSeq" and library == "t01_r3":
            # Skipping HiSeq negative control library using water as input material
            print("Skipping", filename)
            continue
        print("Reading", filename)
        # The context manager guarantees the file is closed even if one of
        # the format checks or the integer conversion raises.
        with open(filename) as handle:
            # First line is a header of the form
            #   #annotation<TAB><dataset>,<library>:A,C,G,T,a,c,g,t
            line = next(handle)
            assert line.startswith("#")
            words = line[1:].strip().split("\t")
            assert len(words) == 2
            assert words[0] == "annotation"
            terms = words[1].split(":")
            assert terms[0].split(",") == [dataset, library]
            assert terms[1].split(",") == ['A', 'C', 'G', 'T', 'a', 'c', 'g', 't']
            # Remaining lines: <annotation><TAB><comma-separated integers>
            for line in handle:
                words = line.strip().split("\t")
                assert len(words) == 2
                annotation = words[0]
                row = array(words[1].split(","), int)
                counts[dataset][library][annotation] = row

# Write one combined tab-separated table: a header line followed by one line
# per annotation, with one "A,C,G,T,a,c,g,t" column per (dataset, library).
filename = "firstnucleotide.txt"
print("Writing", filename)

# Build the header once, after all (dataset, library) columns are collected.
words = ["#annotation"]
for dataset in counts:
    for library in counts[dataset]:
        word = "%s,%s:A,C,G,T,a,c,g,t" % (dataset, library)
        words.append(word)
line = "\t".join(words) + "\n"

# Row used for libraries that have no entry for a given annotation.
zeros = [0] * 8

# The context manager closes the output file even if writing fails midway.
with open(filename, 'w') as handle:
    handle.write(line)
    # The set of output rows is defined by the annotations of MiSeq library
    # t00_r1 only.
    # NOTE(review): annotations absent from that library are never written,
    # even if other libraries have them - confirm this is intended.
    annotations = counts['MiSeq']['t00_r1']
    for annotation in annotations:
        words = [annotation]
        for dataset in counts:
            for library in counts[dataset]:
                # Unpacking into eight names also enforces that the stored
                # row has exactly eight values.
                A, C, G, T, a, c, g, t = counts[dataset][library].get(annotation, zeros)
                word = "%d,%d,%d,%d,%d,%d,%d,%d" % (A, C, G, T, a, c, g, t)
                words.append(word)
        line = "\t".join(words) + "\n"
        handle.write(line)
